{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "P-AhVYeBWgQ3"
   },
   "outputs": [],
   "source": [
    "# NOTE: PLEASE MAKE SURE YOU ARE RUNNING THIS IN A PYTHON3 ENVIRONMENT\n",
    "\n",
    "import tensorflow as tf\n",
    "print(tf.__version__)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "hd4c-LEHCFZN"
   },
   "outputs": [],
   "source": [
    "# Uncomment and run this if you don't have TensorFlow 2.0x [Check for latest 2.0 instructions at https://www.tensorflow.org/versions/r2.0/api_docs/python/tf]\n",
    "#!pip install tensorflow==2.0.0-beta0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "wm9S3T8-9H4q"
   },
   "outputs": [],
   "source": [
    "# Double check TF 2.0x is installed. If you ran the above block, there was a \n",
    "# 'reset all runtimes' button at the bottom that you needed to press\n",
    "#import tensorflow as tf\n",
    "#print(tf.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "_IoM4VFxWpMR"
   },
   "outputs": [],
   "source": [
    "# If the import fails, run this\n",
    "# !pip install -q tensorflow-datasets \n",
    "\n",
    "import tensorflow_datasets as tfds\n",
    "imdb, info = tfds.load(\"imdb_reviews/subwords8k\", with_info=True, as_supervised=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "wHQ2Ko0zl7M4"
   },
   "outputs": [],
   "source": [
    "train_data, test_data = imdb['train'], imdb['test']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "fqGRSe_eCdOz"
   },
   "outputs": [],
   "source": [
    "tokenizer = info.features['text'].encoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "fPl2BXhYEHRP"
   },
   "outputs": [],
   "source": [
    "sample_string = 'TensorFlow, from basics to mastery'\n",
    "\n",
    "tokenized_string = tokenizer.encode(sample_string)\n",
    "print ('Tokenized string is {}'.format(tokenized_string))\n",
    "\n",
    "original_string = tokenizer.decode(tokenized_string)\n",
    "print ('The original string: {}'.format(original_string))\n",
    "\n",
    "# expected output:\n",
    "# Tokenized string is [6307, 2327, 4043, 2120, 2, 48, 4249, 4429, 7, 2652, 8050]\n",
    "# The original string: TensorFlow, from basics to mastery"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "_3t7vvNLEZml"
   },
   "outputs": [],
   "source": [
    "for ts in tokenized_string:\n",
    "  print ('{} ----> {}'.format(ts, tokenizer.decode([ts])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "BUFFER_SIZE = 25000\n",
    "BATCH_SIZE = 1\n",
    "\n",
    "train_data = train_data.shuffle(BUFFER_SIZE)\n",
    "train_data = train_data.padded_batch(BATCH_SIZE)\n",
    "test_data = test_data.padded_batch(BATCH_SIZE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "5NEpdhb8AxID"
   },
   "outputs": [],
   "source": [
    "embedding_dim = 64\n",
    "model = tf.keras.Sequential([\n",
    "    tf.keras.layers.Embedding(tokenizer.vocab_size, embedding_dim),\n",
    "    tf.keras.layers.GlobalAveragePooling1D(),\n",
    "    tf.keras.layers.Dense(6, activation='relu'),\n",
    "    tf.keras.layers.Dense(1, activation='sigmoid')\n",
    "])\n",
    "\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "fkt8c5dNuUlT"
   },
   "outputs": [],
   "source": [
    "num_epochs = 10\n",
    "\n",
    "model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])\n",
    "\n",
    "history = model.fit(train_data, epochs=num_epochs, validation_data=test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "-_rMnm7WxQGT"
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "\n",
    "def plot_graphs(history, string):\n",
    "  plt.plot(history.history[string])\n",
    "  plt.plot(history.history['val_'+string])\n",
    "  plt.xlabel(\"Epochs\")\n",
    "  plt.ylabel(string)\n",
    "  plt.legend([string, 'val_'+string])\n",
    "  plt.show()\n",
    "  \n",
    "plot_graphs(history, \"accuracy\")\n",
    "plot_graphs(history, \"loss\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "qACq5FLzTW4A"
   },
   "outputs": [],
   "source": [
    "e = model.layers[0]\n",
    "weights = e.get_weights()[0]\n",
    "print(weights.shape) # shape: (vocab_size, embedding_dim)\n",
    "\n",
    "import io\n",
    "\n",
    "out_v = io.open('vecs.tsv', 'w', encoding='utf-8')\n",
    "out_m = io.open('meta.tsv', 'w', encoding='utf-8')\n",
    "for word_num in range(1, tokenizer.vocab_size):\n",
    "  word = tokenizer.decode([word_num])\n",
    "  embeddings = weights[word_num]\n",
    "  out_m.write(word + \"\\n\")\n",
    "  out_v.write('\\t'.join([str(x) for x in embeddings]) + \"\\n\")\n",
    "out_v.close()\n",
    "out_m.close()\n",
    "\n",
    "\n",
    "# try:\n",
    "#   from google.colab import files\n",
    "# except ImportError:\n",
    "#   pass\n",
    "# else:\n",
    "#   files.download('vecs.tsv')\n",
    "#   files.download('meta.tsv')"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "Course 3 - Week 2 - Lesson 3.ipynb",
   "provenance": [],
   "version": "0.3.2"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
